import gzip
import os.path
import pickle

from rdkit import Chem, RDConfig
from rdkit.Chem import Crippen

# Run Crippen's private initialiser once at import time, before runIt() calls
# Crippen._GetAtomContribs().
# NOTE(review): presumably this loads the parameter data that the contribution
# calculation relies on -- confirm against rdkit.Chem.Crippen.
Crippen._Init()


def runIt(inFileName, outFileName, smiCol=0, maxMols=-1, delim=','):
  """Pickle Crippen atom contributions for the SMILES in a gzipped table.

  Every line of `inFileName` that does not start with '#' is split on
  `delim`; the field at index `smiCol` is taken as a SMILES string, echoed
  to stdout and parsed with Chem.MolFromSmiles.  For each SMILES that
  parses, the tuple `(smiles, Crippen._GetAtomContribs(mol))` is written to
  `outFileName` as its own pickle record (one pickle.dump call per
  molecule, so the file must be read back with repeated pickle.load calls).

  Arguments:
    - inFileName: path of the gzip-compressed, delimited text file to read
    - outFileName: path of the pickle file to write (truncated if it exists)
    - smiCol: zero-based index of the column holding the SMILES
    - maxMols: stop after this many non-comment lines; values <= 0 mean
      "no limit".  Lines whose SMILES fails to parse still count.
    - delim: column separator

  Raises:
    - IndexError if a non-comment line has no column `smiCol`

  """
  # The input must be opened in text mode ('rt'): in binary mode every line
  # is a bytes object, so the '#' test never matches and splitting on a str
  # delimiter raises TypeError.  The with-statement closes both handles even
  # if parsing or pickling fails part-way through.
  with gzip.open(inFileName, 'rt') as inF, open(outFileName, 'wb+') as outF:
    nDone = 0
    # Iterate lazily so an early maxMols stop does not read the whole file.
    for line in inF:
      if line.startswith('#'):
        continue
      splitL = line.strip().split(delim)
      smi = splitL[smiCol].strip()
      print(smi)
      mol = Chem.MolFromSmiles(smi)
      if mol:
        contribs = Crippen._GetAtomContribs(mol)
        pickle.dump((smi, contribs), outF)
      # Count every data line, parsed or not, toward the maxMols limit.
      nDone += 1
      if maxMols > 0 and nDone >= maxMols:
        break


if __name__ == '__main__':
  # Script entry point: build the Crippen contribution regression pickle from
  # the building-blocks SMILES file that lives in the RDKit test-data
  # directory.  The SMILES sit in the second tab-separated column and at most
  # 500 data lines are processed.
  testDataDir = os.path.join(RDConfig.RDCodeDir, 'Chem', 'test_data')
  runIt(
    os.path.join(testDataDir, 'buildingblocks.smi.gz'),
    os.path.join(testDataDir, 'Crippen_contribs_regress.2.pkl'),
    smiCol=1,
    maxMols=500,
    delim='\t',
  )
